// Source Code of org.terrier.structures.indexing.DocumentPostingList$postingIterator

/*
 * Terrier - Terabyte Retriever
 * Webpage: http://terrier.org
 * Contact: terrier{a.}dcs.gla.ac.uk
 * University of Glasgow - School of Computing Science
 * http://www.gla.uk
 *
 * The contents of this file are subject to the Mozilla Public License
 * Version 1.1 (the "License"); you may not use this file except in
 * compliance with the License. You may obtain a copy of the License at
 * http://www.mozilla.org/MPL/
 *
 * Software distributed under the License is distributed on an "AS IS"
 * basis, WITHOUT WARRANTY OF ANY KIND, either express or implied. See
 * the License for the specific language governing rights and limitations
 * under the License.
 *
 * The Original Code is DocumentPostingList.java.
 *
 * The Original Code is Copyright (C) 2004-2011 the University of Glasgow.
 * All Rights Reserved.
 *
 * Contributor(s):
 *   Craig Macdonald <craigm{a.}dcs.gla.ac.uk> (original author)
 *   
 */


package org.terrier.structures.indexing;


import gnu.trove.TObjectIntHashMap;
import gnu.trove.TObjectIntProcedure;


import java.io.IOException;
import java.util.Arrays;
import java.util.Comparator;


import org.terrier.sorting.HeapSortInt;
import org.terrier.structures.BasicDocumentIndexEntry;
import org.terrier.structures.DocumentIndexEntry;
import org.terrier.structures.postings.BasicPostingImpl;
import org.terrier.structures.postings.IterablePosting;
import org.terrier.structures.postings.IterablePostingImpl;
import org.terrier.structures.postings.WritablePosting;
import org.terrier.utility.ApplicationSetup;
import org.terrier.utility.TermCodes;
/** Represents the postings of one document. Uses HashMaps internally.
  * <p>
  * <b>Properties:</b><br>
  * <ul><li><tt>indexing.avg.unique.terms.per.doc</tt> - number of unique terms per doc on average, used to tune the initial 
  * size of the hashmaps used in this class.</li></ul>
  */
public class DocumentPostingList {
  /** number of unique terms per doc on average, used to tune the initial size of the hashmaps used in this class. */
  protected static final int AVG_DOCUMENT_UNIQUE_TERMS =
    Integer.parseInt(ApplicationSetup.getProperty("indexing.avg.unique.terms.per.doc", "120"));


  /** length of the document so far. Sum of the term frequencies inserted so far. */
  protected int documentLength = 0;


  /** mapping term to tf mapping */  
  protected final TObjectIntHashMap<String> occurrences = new TObjectIntHashMap<String>(AVG_DOCUMENT_UNIQUE_TERMS);
  
  /** Create a new DocumentPostingList object */
  public DocumentPostingList()
  {}
  
  /** Returns all terms in this posting list */
  public String[] termSet()
  {
    return occurrences.keys(new String[0]);
  }
  
  /** Return the frequency of the specified term in this document */
  public int getFrequency(String term)
  {
    return occurrences.get(term);
  }  


  /** Removes all postings from this document */
  public void clear()
  {
    occurrences.clear();
    documentLength = 0;
  }


  /** Returns the total number of tokens in this document */  
  public int getDocumentLength()
  {
    return documentLength;
  }


  /** Returns the number of unique terms in this document. */
  public int getNumberOfPointers()
  {
    return occurrences.size();
  }
  /** Insert a term into the posting list of this document 
    * @param term the Term being inserted */
  public void insert(final String term)
  {
    occurrences.adjustOrPutValue(term,1,1);
    documentLength++;
  }
  
  /** Insert a term into the posting list of this document
    * @param tf frequency
      * @param term the Term being inserted */
    public void insert(final int tf, final String term)
    {
        occurrences.adjustOrPutValue(term,tf,tf);
        documentLength++;
    }


    /** Return a DocumentIndexEntry for this document */ 
    public DocumentIndexEntry getDocumentStatistics()
  {
    DocumentIndexEntry die = new BasicDocumentIndexEntry();
    die.setDocumentLength(this.getDocumentLength());
    die.setNumberOfEntries(this.getNumberOfPointers());
    return die;
  }
    
    /** Execute the specifed method for each term. */
    public void forEachTerm(TObjectIntProcedure<String> proc)
    {
      this.occurrences.forEachEntry(proc);
    }
    
    /** Used by getPostings() and getPostings2() to obtain the term id of the term.
     * This implementation uses the TermCodes class. */
    protected int getTermId(String term)
    {
      return TermCodes.getCode(term);
    }


  /** Returns the postings suitable to be written into the direct index.
   * During this, TermIds are assigned. */
  public int[][] getPostings()
  {
    final int termCount = occurrences.size();
    final int[] termids = new int[termCount];
    final int[] tfs = new int[termCount];
    occurrences.forEachEntry( new TObjectIntProcedure<String>() { 
      int i=0;
      public boolean execute(final String a, final int b)
      {
        termids[i] = getTermId(a);
        tfs[i++] = b;
        return true;
      }
    });
    HeapSortInt.ascendingHeapSort(termids, tfs);
    return new int[][]{termids, tfs};
  }
  
  /** Returns a posting iterator suitable to be written into the direct index.
   * During this, TermIds are assigned, using getTermId() method. */
  public IterablePosting getPostings2()
  {
    //obtain and sort termids by id
    
    final int termCount = occurrences.size();
    final TObjectIntHashMap<String> cache_termids = new TObjectIntHashMap<String>(termCount);  
    
    occurrences.forEachEntry( new TObjectIntProcedure<String>() { 
      public boolean execute(final String a, final int b)
      {
        cache_termids.put(a, getTermId(a));
        return true;
      }
    });
    
    final String[] terms = cache_termids.keys(new String[termCount]);
    Arrays.sort(terms, new Comparator<String>(){
      public int compare(String o1, String o2) {
        return cache_termids.get(o1) - cache_termids.get(o2);
      }      
    });
    final int[] termIds = new int[termCount];
    int i=0;
    for(String t : terms)
    {
      termIds[i++] = cache_termids.get(t);
    }
    return makePostingIterator(terms, termIds);
  }
  
  protected IterablePosting makePostingIterator(String[] _terms, int[] termIds)
  {
    return new postingIterator(_terms, termIds);
  }
  
  protected class postingIterator extends IterablePostingImpl
  {
    String[] terms;
    int[] termIds;
    int i = -1;
    
    public postingIterator(String[] _terms, int[] _termIds)
    {
      terms = _terms;
      termIds = _termIds;
    }
    
    public WritablePosting asWritablePosting() {
      return new BasicPostingImpl(termIds[i], getFrequency());
    }


    public int getDocumentLength() {
      return documentLength;
    }


    public int getFrequency() {
      return occurrences.get(terms[i]);
    }


    public int getId() {
      return termIds[i];
    }


    public void setId(int id) {
      termIds[i] = id;
    }


    public int next() throws IOException {
      if (i >= termIds.length -1)
        return EOL;
      i++;
      return termIds[i];
    }
    
    /** {@inheritDoc} */
    public boolean endOfPostings() {
      return (i >= termIds.length -1);
    }


    public void close() throws IOException {
      terms = null;
      termIds = null;
    }
  }
  
}
// Source Code of org.terrier.structures.indexing.DocumentPostingList$postingIterator

// Related Classes of org.terrier.structures.indexing.DocumentPostingList$postingIterator